EDA

# Load the bank marketing data from the semicolon-delimited CSV file.
data_file_location <- "/Users/ravisivaraman/Downloads/bank-additional/bank-additional-full.csv"

# Use read.csv(), which parses "." as the decimal separator. The original
# call used read.csv2(), which assumes "," decimals and therefore imported
# the numeric columns (emp.var.rate, cons.price.idx, cons.conf.idx,
# euribor3m, nr.employed) as character -- visible in the recorded summary
# output below, which was produced by the original import.
bank_Data <- read.csv(data_file_location, header = TRUE, sep = ";")

# Convert the response variable ("yes"/"no") to a factor.
bank_Data$y <- as.factor(bank_Data$y)

# Check summary of the raw data before any cleaning.
summary(bank_Data)
##       age            job              marital           education        
##  Min.   :17.00   Length:41188       Length:41188       Length:41188      
##  1st Qu.:32.00   Class :character   Class :character   Class :character  
##  Median :38.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :40.02                                                           
##  3rd Qu.:47.00                                                           
##  Max.   :98.00                                                           
##    default            housing              loan             contact         
##  Length:41188       Length:41188       Length:41188       Length:41188      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     month           day_of_week           duration         campaign     
##  Length:41188       Length:41188       Min.   :   0.0   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.: 102.0   1st Qu.: 1.000  
##  Mode  :character   Mode  :character   Median : 180.0   Median : 2.000  
##                                        Mean   : 258.3   Mean   : 2.568  
##                                        3rd Qu.: 319.0   3rd Qu.: 3.000  
##                                        Max.   :4918.0   Max.   :56.000  
##      pdays          previous       poutcome         emp.var.rate      
##  Min.   :  0.0   Min.   :0.000   Length:41188       Length:41188      
##  1st Qu.:999.0   1st Qu.:0.000   Class :character   Class :character  
##  Median :999.0   Median :0.000   Mode  :character   Mode  :character  
##  Mean   :962.5   Mean   :0.173                                        
##  3rd Qu.:999.0   3rd Qu.:0.000                                        
##  Max.   :999.0   Max.   :7.000                                        
##  cons.price.idx     cons.conf.idx       euribor3m         nr.employed       
##  Length:41188       Length:41188       Length:41188       Length:41188      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    y        
##  no :36548  
##  yes: 4640  
##             
##             
##             
## 
# Several columns encode missing data as the string "unknown"; recode every
# such cell to NA so missing values can be handled uniformly.
bank_Data[bank_Data=="unknown"] <- NA
sum(is.na(bank_Data))
## [1] 12718
# Drop every row containing at least one NA (complete-case analysis).
bank.nona = na.omit(bank_Data)
sum(is.na(bank.nona))
## [1] 0
# The classes are imbalanced ("yes" is rare relative to "no"), so down-sample
# the majority class to obtain a balanced data set.
# NOTE(review): downSample() is from caret (presumably loaded in an earlier
# chunk). The whole frame -- including the response -- is passed as x; the
# balanced response is appended as a new factor column named "Class"
# (see the str() output further below).
set.seed(12345)
bank.ds <-downSample(bank.nona, y=bank.nona$y)
summary(bank.ds)
##       age            job              marital           education        
##  Min.   :17.00   Length:7718        Length:7718        Length:7718       
##  1st Qu.:31.00   Class :character   Class :character   Class :character  
##  Median :36.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :39.55                                                           
##  3rd Qu.:46.00                                                           
##  Max.   :89.00                                                           
##    default            housing              loan             contact         
##  Length:7718        Length:7718        Length:7718        Length:7718       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     month           day_of_week           duration         campaign    
##  Length:7718        Length:7718        Min.   :   2.0   Min.   : 1.00  
##  Class :character   Class :character   1st Qu.: 142.0   1st Qu.: 1.00  
##  Mode  :character   Mode  :character   Median : 257.0   Median : 2.00  
##                                        Mean   : 375.2   Mean   : 2.28  
##                                        3rd Qu.: 499.0   3rd Qu.: 3.00  
##                                        Max.   :4199.0   Max.   :43.00  
##      pdays        previous        poutcome         emp.var.rate      
##  Min.   :  0   Min.   :0.0000   Length:7718        Length:7718       
##  1st Qu.:999   1st Qu.:0.0000   Class :character   Class :character  
##  Median :999   Median :0.0000   Mode  :character   Mode  :character  
##  Mean   :881   Mean   :0.3332                                        
##  3rd Qu.:999   3rd Qu.:0.0000                                        
##  Max.   :999   Max.   :6.0000                                        
##  cons.price.idx     cons.conf.idx       euribor3m         nr.employed       
##  Length:7718        Length:7718        Length:7718        Length:7718       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Class     
##  no :3859  
##  yes:3859  
##            
##            
##            
## 

Plotting Predictors in the model

# Subsets of the balanced data used for plotting. Columns are selected by
# name (equivalent to the original positional indices, see str() below) so
# the selections are robust to column reordering. The response ("Class") is
# carried along in each plotting subset.
bank.Fact  <- bank.ds[, c("job", "marital", "education", "housing", "loan",
                          "contact", "month", "day_of_week", "poutcome", "Class")]
bank.Fact1 <- bank.ds[, c("job", "marital", "education", "housing", "Class")]
bank.Fact2 <- bank.ds[, c("loan", "contact", "month", "day_of_week",
                          "poutcome", "Class")]

# Continuous predictors, split the same way.

bank.Cont  <- bank.ds[, c("age", "duration", "campaign", "pdays", "previous",
                          "emp.var.rate", "cons.price.idx", "cons.conf.idx",
                          "euribor3m", "nr.employed")]
bank.Cont1 <- bank.ds[, c("age", "duration", "campaign", "pdays",
                          "previous", "Class")] # Adding Response variable also
bank.Cont2 <- bank.ds[, c("emp.var.rate", "cons.price.idx", "cons.conf.idx",
                          "euribor3m", "nr.employed", "Class")]  # Adding Response variable also

# Categorical predictor columns only (job .. day_of_week), without response.

bank_cat1 <- bank.ds[, c("job", "marital", "education", "default", "housing",
                         "loan", "contact", "month", "day_of_week")]

str(bank.ds)
## 'data.frame':    7718 obs. of  21 variables:
##  $ age           : int  37 34 31 41 36 40 28 32 28 33 ...
##  $ job           : Factor w/ 11 levels "admin.","blue-collar",..: 1 1 1 10 8 5 10 10 11 10 ...
##  $ marital       : Factor w/ 3 levels "divorced","married",..: 3 2 2 2 3 3 3 1 3 3 ...
##  $ education     : Factor w/ 7 levels "basic.4y","basic.6y",..: 7 4 7 6 4 7 6 6 4 6 ...
##  $ default       : Factor w/ 1 level "no": 1 1 1 1 1 1 1 1 1 1 ...
##  $ housing       : Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 2 2 1 2 ...
##  $ loan          : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 2 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 1 1 1 1 1 1 2 1 2 1 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 2 4 8 9 4 7 7 2 7 4 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 2 1 5 4 2 1 5 5 1 4 ...
##  $ duration      : int  220 59 596 304 719 52 229 11 109 312 ...
##  $ campaign      : int  2 7 2 2 2 1 2 12 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ emp.var.rate  : num  1.4 -2.9 -0.1 -3.4 1.4 -1.8 1.1 1.4 1.1 1.4 ...
##  $ cons.price.idx: num  93.4 92.5 93.2 92.4 93.9 ...
##  $ cons.conf.idx : num  -36.1 -33.6 -42 -26.9 -42.7 -46.2 -36.4 -36.1 -36.4 -42.7 ...
##  $ euribor3m     : num  4.965 1.059 4.12 0.744 4.962 ...
##  $ nr.employed   : num  5228 5076 5196 5018 5228 ...
##  $ Class         : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
# NOTE(review): the original line was `pairs(bank1)`, but no object named
# `bank1` is defined anywhere in this file and the call would fail as
# written. Plotting the continuous subset instead -- confirm this matches
# the author's intent.
pairs(bank.Cont)

# Copy the down-sampled response (added by downSample() as "Class") back to
# a column named "y" so the formula-based plots and aggregates below can
# keep referring to `y`.
bank.ds$y=bank.ds$Class
# NOTE(review): attach() is discouraged (it can silently mask objects), but
# it is kept because the plot() calls below rely on the attached columns.
attach(bank.ds)
# Per-age counts of "no"/"yes" responses.
aggregate(y~age,data=bank.ds,summary)
##    age y.no y.yes
## 1   17    1     0
## 2   18    1     6
## 3   19    3    10
## 4   20    5    16
## 5   21    9    23
## 6   22   13    32
## 7   23   28    45
## 8   24   42    67
## 9   25   61    74
## 10  26   80   114
## 11  27   72    96
## 12  28   82   137
## 13  29  145   173
## 14  30  185   179
## 15  31  211   196
## 16  32  188   158
## 17  33  215   187
## 18  34  181   167
## 19  35  205   146
## 20  36  191   134
## 21  37  159   117
## 22  38  122   114
## 23  39  122    96
## 24  40  113    70
## 25  41  124    89
## 26  42  120    78
## 27  43  101    73
## 28  44   88    55
## 29  45   91    61
## 30  46   93    62
## 31  47   77    46
## 32  48   77    86
## 33  49   68    37
## 34  50   59    71
## 35  51   69    56
## 36  52   55    61
## 37  53   58    53
## 38  54   63    51
## 39  55   47    42
## 40  56   48    63
## 41  57   48    45
## 42  58   37    51
## 43  59   34    49
## 44  60   18    47
## 45  61    4    28
## 46  62    2    18
## 47  63    2    12
## 48  64    1    22
## 49  65    4    17
## 50  66    6    22
## 51  67    0     8
## 52  68    1    15
## 53  69    2    14
## 54  70    3    17
## 55  71    3    18
## 56  72    4    11
## 57  73    2    11
## 58  74    1    12
## 59  75    2     8
## 60  76    2    14
## 61  77    0     8
## 62  78    3     9
## 63  79    1     7
## 64  80    1    15
## 65  81    1     5
## 66  82    2     9
## 67  83    1     8
## 68  84    0     2
## 69  85    2     2
## 70  86    0     2
## 71  87    0     1
## 72  88    0     9
## 73  89    0     2
# Box plots of each continuous predictor against the response. Supplying
# `data = bank.ds` makes the formula source explicit instead of relying on
# the attach()ed columns; the rendered plots are identical.
box_cols <- c("red", "blue")
plot(age ~ y, data = bank.ds, col = box_cols)

plot(campaign ~ y, data = bank.ds, col = box_cols)

plot(duration ~ y, data = bank.ds, col = box_cols)

plot(emp.var.rate ~ y, data = bank.ds, col = box_cols)

plot(cons.conf.idx ~ y, data = bank.ds, col = box_cols)

# Side-by-side bar charts of each categorical predictor, filled by response.
ggplot(bank.ds, aes(x = month, fill = y)) + geom_bar(position = "dodge")

ggplot(bank.ds, aes(x = marital, fill = y)) + geom_bar(position = "dodge")

ggplot(bank.ds, aes(x = education, fill = y)) + geom_bar(position = "dodge")

ggplot(bank.ds, aes(x = default, fill = y)) + geom_bar(position = "dodge")

ggplot(bank.ds, aes(x = poutcome, fill = y)) + geom_bar(position = "dodge")

ggplot(bank.ds, aes(x = day_of_week, fill = y)) + geom_bar(position = "dodge")

# Density of the response grouped by marital status.
ggplot(bank.ds, aes(x = y, fill = marital)) + geom_density() + labs(title = " Distribution by Marital Status")

# After NA removal and down-sampling, every remaining record has
# default == "no" (single-level factor).

summary(bank.ds$default)
##   no 
## 7718

Aggregate

## Aggregates
# Response counts ("no"/"yes") within each marital status.
aggregate(y~marital,data=bank.ds,summary)
##    marital y.no y.yes
## 1 divorced  453   410
## 2  married 2254  2056
## 3   single 1152  1393
# Response counts by contact month.
aggregate(y~month,data=bank.ds,summary)
##    month y.no y.yes
## 1    apr  249   468
## 2    aug  607   533
## 3    dec    6    74
## 4    jul  613   512
## 5    jun  455   452
## 6    mar   24   246
## 7    may 1329   700
## 8    nov  471   365
## 9    oct   57   287
## 10   sep   48   222
# Response counts by default status (single level "no" after cleaning).
aggregate(y~default,data=bank.ds,summary)
##   default y.no y.yes
## 1      no 3859  3859
# Response counts by education level.
aggregate(y~education,data=bank.ds,summary)
##             education y.no y.yes
## 1            basic.4y  292   326
## 2            basic.6y  200   136
## 3            basic.9y  548   380
## 4         high.school  984   934
## 5          illiterate    1     3
## 6 professional.course  551   538
## 7   university.degree 1283  1542
# Response counts by previous-campaign outcome.
aggregate(y~poutcome,data=bank.ds,summary)
##      poutcome y.no y.yes
## 1     failure  458   508
## 2 nonexistent 3334  2572
## 3     success   67   779
## GGpairs

# NOTE(review): `bank1` and `bank2` are not defined anywhere in this file --
# these two calls fail as written. They presumably refer to the continuous
# subsets (e.g. bank.Cont1 / bank.Cont2); confirm and rename.
ggpairs(bank1,aes(colour=y))

ggpairs(bank2,aes(colour=y))

# NOTE(review): bank_cat1 holds predictor columns only (job .. day_of_week)
# and has no `y` column; `colour = y` resolves only through the attach()ed
# bank.ds. Consider including the response in the subset instead.
ggpairs(bank_cat1,aes(colour=y))

Heatmaps

####### PCA ##########
# 
# pca1<-prcomp(bank2,scale.=TRUE)
# plot(prcomp(bank2))
# pc.scores<-pca1$x
# pairs(pc.scores)
# cor(pc.scores)
# 
# 
# ######
# 
# 
# prop.table(table(y,month),2)
# plot(y~month,col=c("red","blue"))


### Correlation test for factors and Continous variables


# Correlation heat map of a mixed data frame: model.matrix() one-hot encodes
# the factor columns (no intercept), then the full correlation matrix is
# rendered as an upper-triangle ggcorrplot. Extracted into a helper to
# replace four copy-pasted pipelines.
#
# df        -- data frame of predictors (factors and/or numerics)
# lab_size  -- size of the correlation labels inside the tiles
# axis_size -- font size for the axis tick labels
# Returns the ggplot object (auto-printed when called at top level).
plot_mixed_correlation <- function(df, lab_size = 2.5, axis_size = 8) {
  model.matrix(~0+., data = df) %>%
    cor() %>%
    ggcorrplot(show.diag = FALSE, type = "upper", lab = TRUE,
               lab_size = lab_size, insig = "blank") +
    theme(axis.text.x = element_text(angle = 60, hjust = 1, size = axis_size),
          axis.text.y = element_text(hjust = 1, size = axis_size))
}

plot_mixed_correlation(bank.Fact1)

plot_mixed_correlation(bank.Fact2)

plot_mixed_correlation(bank.Cont1, lab_size = 3, axis_size = 10)

plot_mixed_correlation(bank.Cont2, lab_size = 3, axis_size = 10)

Feature Selection

# Setup data for the feature-selection section: re-read the raw file with
# read.csv() ("." decimals, so numeric columns import as numeric).

bank_additional = read.csv(data_file_location, sep=";")


# Cleanup: recode the "unknown" sentinel to NA, then keep complete cases.
bank_additional[bank_additional=="unknown"] <- NA
bank_additional.nona = na.omit(bank_additional)

# Convert every categorical column (including the response y) to a factor.
# A single loop over the column names replaces the eleven copy-pasted
# as.factor() assignments of the original; the converted set is identical.
factor_cols <- c("contact", "poutcome", "day_of_week", "month", "loan",
                 "housing", "default", "education", "marital", "job", "y")
bank_additional.nona[factor_cols] <- lapply(bank_additional.nona[factor_cols],
                                            as.factor)


# Down-sample the majority class (caret); the balanced response is appended
# as a new factor column named "Class".
bankadditional_ds = downSample(bank_additional.nona, y=bank_additional.nona$y)
nrow(bankadditional_ds)
## [1] 7718
# Remove the "default" column (it is single-valued "no" after NA removal and
# carries no information). Note: the original `y` column remains; the
# models below use "Class" as the response.
bankaddition_ds_nodef = within(bankadditional_ds, rm(default))


# 75/25 train/test split on row indices.
smp_size <- floor(0.75 * nrow(bankaddition_ds_nodef))

set.seed(123)
train_bankdata <- sample(seq_len(nrow(bankaddition_ds_nodef)), size = smp_size)

train_bankdata_ds = bankaddition_ds_nodef[train_bankdata,]
test_bankdata_ds = bankaddition_ds_nodef[-train_bankdata,]
# Inspect the training set: the response is the factor column "Class".
str(train_bankdata_ds)
## 'data.frame':    5788 obs. of  20 variables:
##  $ age           : int  70 33 35 56 37 32 36 32 29 46 ...
##  $ job           : Factor w/ 11 levels "admin.","blue-collar",..: 4 8 2 4 1 10 2 8 10 1 ...
##  $ marital       : Factor w/ 3 levels "divorced","married",..: 1 2 3 2 2 3 2 3 3 2 ...
##  $ education     : Factor w/ 7 levels "basic.4y","basic.6y",..: 1 4 3 1 7 4 1 4 6 4 ...
##  $ housing       : Factor w/ 2 levels "no","yes": 2 1 2 1 1 2 1 2 1 2 ...
##  $ loan          : Factor w/ 2 levels "no","yes": 1 2 1 1 2 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 1 2 1 2 1 2 2 2 2 1 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 10 7 7 5 3 7 5 7 4 10 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 4 1 5 5 2 5 5 2 3 5 ...
##  $ duration      : int  161 437 143 204 206 202 357 158 575 408 ...
##  $ campaign      : int  1 2 1 1 2 1 4 3 4 1 ...
##  $ pdays         : int  999 999 999 999 6 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 3 2 2 2 2 2 ...
##  $ emp.var.rate  : num  -3.4 1.1 -1.8 1.4 -3 1.1 1.4 1.1 1.4 -3.4 ...
##  $ cons.price.idx: num  92.4 94 92.9 94.5 92.7 ...
##  $ cons.conf.idx : num  -29.8 -36.4 -46.2 -41.8 -33 -36.4 -41.8 -36.4 -42.7 -29.8 ...
##  $ euribor3m     : num  0.75 4.859 1.334 4.864 0.706 ...
##  $ nr.employed   : num  5018 5191 5099 5228 5024 ...
##  $ Class         : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 1 1 1 2 ...
# Column order check: "Class" is the last (20th) column.
colnames(train_bankdata_ds)
##  [1] "age"            "job"            "marital"        "education"     
##  [5] "housing"        "loan"           "contact"        "month"         
##  [9] "day_of_week"    "duration"       "campaign"       "pdays"         
## [13] "previous"       "poutcome"       "emp.var.rate"   "cons.price.idx"
## [17] "cons.conf.idx"  "euribor3m"      "nr.employed"    "Class"

Lasso Feature Selection

# Design matrix for glmnet: model.matrix() expands factor predictors into
# dummy columns. NOTE(review): `duration` is absent from the formula --
# confirm the omission is intentional.
dat.train.x  = model.matrix(Class~age+campaign+cons.conf.idx+cons.price.idx+contact+day_of_week+education+emp.var.rate+euribor3m+housing+job+loan+marital+month+nr.employed+pdays+poutcome+previous,train_bankdata_ds)
#dat.train.x  = model.matrix(Class~.,train_bankdata_ds)
# Select the response by name rather than by position (the original used
# column index 20), which is robust to column reordering.
dat.train.y <- train_bankdata_ds[["Class"]]
# Cross-validated lasso logistic regression, scored by misclassification rate.
cvfit <- cv.glmnet(dat.train.x, dat.train.y, family = "binomial", type.measure = "class", nlambda = 1000)
# Refit a single model at the CV-optimal lambda.
finalmodel<-glmnet(dat.train.x, dat.train.y, family = "binomial",lambda=cvfit$lambda.min)

# Test-set design matrix built with the identical formula.
dat.test.x  = model.matrix(Class~age+campaign+cons.conf.idx+cons.price.idx+contact+day_of_week+education+emp.var.rate+euribor3m+housing+job+loan+marital+month+nr.employed+pdays+poutcome+previous,test_bankdata_ds)
#dat.test.x  = model.matrix(Class~.,test_bankdata_ds)
# Predicted P(Class = "yes") for the held-out test set.
fit.pred.lasso <- predict(finalmodel, newx = dat.test.x, type = "response")

Stepwise - Forward, Backward and Both

# Full logistic regression over the same predictor set used for the lasso.
full.log<-glm(Class~age+campaign+cons.conf.idx+cons.price.idx+contact+day_of_week+education+emp.var.rate+euribor3m+housing+job+loan+marital+month+nr.employed+pdays+poutcome+previous,family="binomial",data=train_bankdata_ds)
#full.log<-glm(Class~.,family="binomial",data=train_bankdata_ds)
# NOTE(review): running stepAIC with direction="forward" starting from the
# *full* model cannot add any terms, so forward.step.log is simply the full
# model. Proper forward selection would start from a null model with a
# `scope` argument. Kept as written to preserve the reported results.
forward.step.log<-full.log %>% stepAIC(direction="forward", trace=FALSE)
# "backward" spelled out in full -- the original "back" only worked through
# R's partial argument-value matching.
backward.step.log<-full.log %>% stepAIC(direction="backward", trace=FALSE)
both.step.log<-full.log %>% stepAIC(direction="both", trace=FALSE)

# Odds ratios with 95% Wald confidence intervals for the selected model.
exp(cbind("Odds ratio" = coef(forward.step.log), confint.default(forward.step.log, level = 0.95)))
##                                Odds ratio         2.5 %       97.5 %
## (Intercept)                  1.870418e-94 8.647698e-146 4.045543e-43
## age                          1.000128e+00  9.924983e-01 1.007816e+00
## campaign                     9.694594e-01  9.420718e-01 9.976432e-01
## cons.conf.idx                1.009591e+00  9.811927e-01 1.038812e+00
## cons.price.idx               7.549592e+00  3.430317e+00 1.661547e+01
## contacttelephone             4.977718e-01  3.918306e-01 6.323569e-01
## day_of_weekmon               8.329301e-01  6.807647e-01 1.019108e+00
## day_of_weekthu               9.971352e-01  8.182358e-01 1.215149e+00
## day_of_weektue               9.650852e-01  7.901139e-01 1.178804e+00
## day_of_weekwed               1.267813e+00  1.037211e+00 1.549684e+00
## educationbasic.6y            1.318427e+00  9.007611e-01 1.929758e+00
## educationbasic.9y            1.101583e+00  8.130921e-01 1.492434e+00
## educationhigh.school         1.163220e+00  8.622221e-01 1.569295e+00
## educationilliterate          5.240160e+00  4.536076e-01 6.053531e+01
## educationprofessional.course 1.149827e+00  8.273811e-01 1.597937e+00
## educationuniversity.degree   1.371970e+00  1.012983e+00 1.858177e+00
## emp.var.rate                 2.138751e-01  1.363260e-01 3.355382e-01
## euribor3m                    1.351091e+00  8.837960e-01 2.065463e+00
## housingyes                   9.606988e-01  8.481378e-01 1.088198e+00
## jobblue-collar               9.891484e-01  7.786658e-01 1.256527e+00
## jobentrepreneur              1.075043e+00  7.468508e-01 1.547453e+00
## jobhousemaid                 1.011880e+00  6.466381e-01 1.583422e+00
## jobmanagement                9.224340e-01  7.149562e-01 1.190121e+00
## jobretired                   1.472305e+00  1.024983e+00 2.114848e+00
## jobself-employed             1.081434e+00  7.642356e-01 1.530287e+00
## jobservices                  9.859002e-01  7.676488e-01 1.266203e+00
## jobstudent                   1.893147e+00  1.212400e+00 2.956125e+00
## jobtechnician                1.251546e+00  1.011042e+00 1.549261e+00
## jobunemployed                1.280300e+00  8.494467e-01 1.929688e+00
## loanyes                      9.304898e-01  7.845191e-01 1.103620e+00
## maritalmarried               1.103767e+00  9.041222e-01 1.347496e+00
## maritalsingle                1.133466e+00  9.048773e-01 1.419800e+00
## monthaug                     2.098534e+00  1.354230e+00 3.251920e+00
## monthdec                     1.504521e+00  6.777552e-01 3.339825e+00
## monthjul                     1.281441e+00  9.433188e-01 1.740760e+00
## monthjun                     5.776635e-01  3.831440e-01 8.709391e-01
## monthmar                     4.883722e+00  2.808021e+00 8.493790e+00
## monthmay                     7.225746e-01  5.579613e-01 9.357531e-01
## monthnov                     6.934656e-01  4.806058e-01 1.000601e+00
## monthoct                     1.172986e+00  7.110574e-01 1.935001e+00
## monthsep                     1.693977e+00  9.190046e-01 3.122462e+00
## nr.employed                  1.005117e+00  9.954929e-01 1.014834e+00
## pdays                        9.987639e-01  9.977881e-01 9.997407e-01
## poutcomenonexistent          1.516538e+00  1.043937e+00 2.203090e+00
## poutcomesuccess              1.671215e+00  6.285813e-01 4.443274e+00
## previous                     9.525131e-01  7.186014e-01 1.262565e+00
# Test-set predicted probabilities P(Class = "yes") for each stepwise model.
fit.pred.step.forward<-predict(forward.step.log,newdata=test_bankdata_ds,type="response")
fit.pred.step.backward<-predict(backward.step.log,newdata=test_bankdata_ds,type="response")
fit.pred.step.both<-predict(both.step.log,newdata=test_bankdata_ds,type="response")

Find the best cutoff

# Sweep the classification cutoff from 0.1 to 0.9 and record accuracy,
# sensitivity and specificity for the three stepwise models and the lasso.
# Result vectors are preallocated (the original grew them one element at a
# time with c()).
accuracy_step.forward = numeric(9)
accuracy_step.backward = numeric(9)
accuracy_step.both = numeric(9)
accuracy_lasso = numeric(9)

sensitivity_step.forward = numeric(9)
sensitivity_step.backward = numeric(9)
sensitivity_step.both = numeric(9)
sensitivity_lasso = numeric(9)

specificity_step.forward = numeric(9)
specificity_step.backward = numeric(9)
specificity_step.both = numeric(9)
specificity_lasso = numeric(9)

for (counter in 1:9)
{
    cutoff=0.1 * counter

    # Forward model: threshold the probabilities, build the confusion
    # matrix, then pull accuracy (overall[1]), sensitivity (byClass[1]) and
    # specificity (byClass[2]).
    # NOTE: class.step.forward is deliberately left in the global
    # environment -- the chi-squared goodness-of-fit test later in the file
    # reuses the classification from the final iteration (cutoff = 0.9).
    class.step.forward<-factor(ifelse(fit.pred.step.forward<cutoff,"no","yes"),levels=c("no","yes"))
    conf.step.forward<-table(class.step.forward,test_bankdata_ds$Class)
    cmval = confusionMatrix(conf.step.forward)
    accuracy_step.forward[counter] = cmval$overall[1]
    sensitivity_step.forward[counter] = cmval$byClass[1]
    specificity_step.forward[counter] = cmval$byClass[2]

    # Backward-elimination model.
    class.step.backward<-factor(ifelse(fit.pred.step.backward<cutoff,"no","yes"),levels=c("no","yes"))
    conf.step.backward<-table(class.step.backward,test_bankdata_ds$Class)
    cmval = confusionMatrix(conf.step.backward)
    accuracy_step.backward[counter] = cmval$overall[1]
    sensitivity_step.backward[counter] = cmval$byClass[1]
    specificity_step.backward[counter] = cmval$byClass[2]

    # Both-directions stepwise model.
    class.step.both<-factor(ifelse(fit.pred.step.both<cutoff,"no","yes"),levels=c("no","yes"))
    conf.step.both<-table(class.step.both,test_bankdata_ds$Class)
    cmval = confusionMatrix(conf.step.both)
    accuracy_step.both[counter] = cmval$overall[1]
    sensitivity_step.both[counter] = cmval$byClass[1]
    specificity_step.both[counter] = cmval$byClass[2]

    # Lasso model (ifelse() handles the predict.glmnet result elementwise).
    class.lasso<-factor(ifelse(fit.pred.lasso<cutoff,"no","yes"),levels=c("no","yes"))
    conf.lasso<-table(class.lasso,test_bankdata_ds$Class)
    cmval = confusionMatrix(conf.lasso)
    accuracy_lasso[counter] = cmval$overall[1]
    sensitivity_lasso[counter] = cmval$byClass[1]
    specificity_lasso[counter] = cmval$byClass[2]

}

Plot the graph

# Compare the four models on one metric per figure: red = forward,
# orange = backward, pink = both, blue = lasso. The x axis is the cutoff
# index (cutoff * 10).
compare_models <- function(fwd, bwd, both, lasso, metric_label) {
  plot(fwd, type = "l", col = "red", ylim = c(0, 1),
       xlab = "Cutoff/10", ylab = metric_label)
  lines(bwd, col = "orange", type = "b", lwd = 1.5)
  lines(both, col = "pink", type = "b", lwd = 1.5)
  lines(lasso, col = "blue", type = "b", lwd = 1.5)
}

compare_models(accuracy_step.forward, accuracy_step.backward,
               accuracy_step.both, accuracy_lasso, "Accuracy")

compare_models(sensitivity_step.forward, sensitivity_step.backward,
               sensitivity_step.both, sensitivity_lasso, "Sensitivity")

compare_models(specificity_step.forward, specificity_step.backward,
               specificity_step.both, specificity_lasso, "Specificity")

Plot the graph by type

# One panel per model: accuracy (red) with sensitivity (orange) and
# specificity (pink) overlaid, across the nine cutoffs.
# NOTE: these par() settings change the graphics state globally and are not
# restored afterwards.
par(pch = 22, col = "blue") # plotting symbol and colour
par(mfrow = c(2, 4)) # lay the panels out on one page

plot(accuracy_step.forward, col = "red", type = "l", xlab = "Cutoff/10", ylim = c(0, 1), main = "Forward")
lines(sensitivity_step.forward, type = "b", lwd = 1.5, col = "orange")
lines(specificity_step.forward, type = "b", lwd = 1.5, col = "pink")

plot(accuracy_step.backward, col = "red", type = "l", xlab = "Cutoff/10", ylim = c(0, 1), main = "Backward")
lines(sensitivity_step.backward, type = "b", lwd = 1.5, col = "orange")
lines(specificity_step.backward, type = "b", lwd = 1.5, col = "pink")

plot(accuracy_step.both, col = "red", type = "l", xlab = "Cutoff/10", ylim = c(0, 1), main = "Both")
lines(sensitivity_step.both, type = "b", lwd = 1.5, col = "orange")
lines(specificity_step.both, type = "b", lwd = 1.5, col = "pink")

plot(accuracy_lasso, col = "red", type = "l", xlab = "Cutoff/10", ylim = c(0, 1), main = "Lasso")
lines(sensitivity_lasso, type = "b", lwd = 1.5, col = "orange")
lines(specificity_lasso, type = "b", lwd = 1.5, col = "pink")

Features by importance

# Variable importance scores from the selected stepwise model.
imp = varImp(forward.step.log, scale=FALSE)
# Lollipop chart of importance ordered by the Overall score. The same
# reorder() expression is used for both the point and segment layers so the
# two layers cannot disagree on ordering (the original mixed reordered and
# raw row names; the rendered positions are the same).
ggplot2::ggplot(imp, aes(x=reorder(rownames(imp),Overall), y=Overall)) +
  geom_point(color="blue", size=4, alpha=0.6) +
  geom_segment(aes(x=reorder(rownames(imp),Overall),
                   xend=reorder(rownames(imp),Overall),
                   y=0, yend=Overall),
               color='skyblue') +
  xlab('Variable') +
  ylab('Overall Importance') +
  theme_light() +
  coord_flip()

## Goodness of Fit Test: the chi-squared test compares the observed sample
## distribution with the expected probability distribution. A significant
## p-value indicates the model with categorical predictors has captured the
## expected probability distribution.

# Chi-squared test on the predicted-vs-observed class table.
# (The original also passed p = c(.5, .5); chisq.test ignores `p` when given
# a two-dimensional table, so it has been removed -- the result is unchanged.)
chisq = chisq.test(table(class.step.forward,test_bankdata_ds$Class))

The p-value is 2.8906339 × 10^(-44), which is less than α, so the result is significant. The logistic regression model has captured the categorical predictors well.

KNN

KNN Analysis of the data.

# Candidate reduced feature sets for KNN; each drops a different set of
# columns from the balanced data. Only bank.ds.red1 is used in the visible
# code below.
bank.ds.red1 <- subset(bank.ds, select = -c(duration, pdays, default,day_of_week,cons.conf.idx) )

bank.ds.red2 <- subset(bank.ds, select = -c(age,job, marital, education, duration, pdays, default,day_of_week,cons.conf.idx, nr.employed) )


bank.ds.red3 <- subset(bank.ds, select = -c(age,job, marital, loan, housing, previous, education, duration, pdays, default,day_of_week,cons.conf.idx, nr.employed ) )



KNN.Variables <- bank.ds.red1

# Convert the factor predictors to their integer level codes so they can be
# used in KNN's distance computation.
# NOTE(review): integer coding imposes an arbitrary ordering on unordered
# factor levels; one-hot/dummy encoding would avoid that. Kept as written.

KNN.Variables$job<-as.integer(KNN.Variables$job)
KNN.Variables$marital<-as.integer(KNN.Variables$marital)
KNN.Variables$education<-as.integer(KNN.Variables$education)
KNN.Variables$housing<-as.integer(KNN.Variables$housing)
KNN.Variables$loan<-as.integer(KNN.Variables$loan)
KNN.Variables$contact<-as.integer(KNN.Variables$contact)
KNN.Variables$month<-as.integer(KNN.Variables$month)
KNN.Variables$poutcome<-as.integer(KNN.Variables$poutcome)

# Standardize every predictor (z-scores via scale()) and keep the response
# as the final column `y`.

KNN.DF.N <- data.frame(age = scale(KNN.Variables$age), job = scale(KNN.Variables$job),marital = scale(KNN.Variables$marital),
                       education = scale(KNN.Variables$education), housing = scale(KNN.Variables$housing), loan = scale(KNN.Variables$loan),
                       contact = scale(KNN.Variables$contact), month = scale(KNN.Variables$month), campaign = scale(KNN.Variables$campaign),
                       previous = scale(KNN.Variables$previous), poutcome = scale(KNN.Variables$poutcome), emp.var.rate = scale(KNN.Variables$emp.var.rate),
                       cons.price.idx = scale(KNN.Variables$cons.price.idx), euribor3m = scale(KNN.Variables$euribor3m), nr.employed = scale(KNN.Variables$nr.employed), y=KNN.Variables$y)


# Run 10 iterations on different train/test splits and average accuracy,
# sensitivity and specificity for each K from 1 to numks. Matrices hold one
# row per iteration, one column per K.
iterations = 10
numks = 50

masterAcc = matrix(nrow = iterations,ncol = numks)
masterSensitivity = matrix(nrow = iterations,ncol = numks)
masterSpecificity = matrix(nrow = iterations,ncol = numks)
splitPerc = .85 #Training / Test split Percentage

# Seed once, before the loop. The original re-seeded with set.seed(12345)
# inside the loop, which made every "different" iteration use the identical
# split and defeated the averaging.
set.seed(12345)
for(j in 1:iterations)
{
  trainIndices = sample(1:dim(KNN.DF.N)[1],round(splitPerc * dim(KNN.DF.N)[1]))
  train.KNN.DF = KNN.DF.N[trainIndices,]
  test.KNN.DF = KNN.DF.N[-trainIndices,]
  for(i in 1:numks)
  {
    # Columns 1:15 are the scaled predictors; the last column is the
    # response y.
    classifications = knn(train.KNN.DF[,c(1:15)], test.KNN.DF[,c(1:15)],train.KNN.DF$y,prob=TRUE, k=i)
    CM = confusionMatrix(table(classifications,test.KNN.DF$y))
    masterAcc[j,i] = CM$overall[1]
    masterSensitivity[j,i] = CM$byClass[1]
    masterSpecificity[j,i] = CM$byClass[2]
  }
}
# Per-K averages across iterations, computed once after the loop. (The
# original recomputed these inside the loop and then wrote CM results into
# masterAcc[j] etc. via linear indexing, which overwrote entries of the
# K = 1 column; those stray writes have been removed.)
MeanAcc = colMeans(masterAcc)
MeanSensitivity = colMeans(masterSensitivity)
MeanSpecificity = colMeans(masterSpecificity)
# Mean accuracy as a function of K.
plot(seq(1,numks,1),MeanAcc, type = "l", main = "Plot of value of K vs. Accuracy", xlab = "Value of K", ylab="Accuracy %")

# NOTE(review): the printed output below reports the best K as 42 with
# 74.35% accuracy (an earlier draft of this comment claimed K = 45 /
# 73.76%), while the model fitted afterwards uses K = 41 -- confirm which
# K is intended.

MeanAcc = mean(colMeans(masterAcc))
MeanSpecificity = mean(colMeans(masterSpecificity))
MeanSensitivity = mean(colMeans(masterSensitivity))

print(paste("Best Accuracy at K =  ", which.max(colMeans(masterAcc)), "of", max(colMeans(masterAcc))*100,"%"))
## [1] "Best Accuracy at K =   42 of 74.3523316062176 %"
print(paste("Mean Accuracy = ", MeanAcc*100,"%"))
## [1] "Mean Accuracy =  72.3851468048359 %"
print(paste("Mean Sensitivity = ", MeanSensitivity*100,"%"))
## [1] "Mean Sensitivity =  77.1205673758865 %"
print(paste("Mean Specificity = ", MeanSpecificity*100,"%"))
## [1] "Mean Specificity =  67.8888888888889 %"
# Fit and evaluate the KNN model at K = 41 on the final train/test split.
classifications.K41 = knn(train.KNN.DF[,c(1:15)], test.KNN.DF[,c(1:15)],train.KNN.DF$y,prob=TRUE, k=41)
# NOTE(review): leftover from another analysis (test.AT.DF3 is undefined here):
#table(classifications.K41,test.AT.DF3$Attrition)
CM.K41 = confusionMatrix(table(classifications.K41,test.KNN.DF$y))

print(paste("Accuracy for K-41 = ", CM.K41$overall[1]*100,"%"))
## [1] "Accuracy for K-41 =  74.006908462867 %"
print(paste("Sensitivity for K-41 = ", CM.K41$byClass[1]*100,"%"))
## [1] "Sensitivity for K-41 =  79.2553191489362 %"
print(paste("Specificity for K-41 = ", CM.K41$byClass[2]*100,"%"))
## [1] "Specificity for K-41 =  69.023569023569 %"

KNN With Only Continuous Data

# KNN with ONLY CONTINUOUS VARIABLE Marketing Data ----

KNN.Variables <- bank.Cont

# Standardize (z-score) every continuous predictor, then attach the response.
cont_vars <- c("age", "campaign", "pdays", "emp.var.rate",
               "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed")

KNN.DF.N <- data.frame(lapply(KNN.Variables[cont_vars], scale))
KNN.DF.N$y <- bank.ds$y


# Run 10 iterations on different train/test sets. We will compute the average
# accuracy, specificity and sensitivity for each k.
iterations <- 10
numks <- 50

# Pre-allocate result matrices: one row per iteration, one column per value of k.
masterAcc <- matrix(nrow = iterations, ncol = numks)
masterSensitivity <- matrix(nrow = iterations, ncol = numks)
masterSpecificity <- matrix(nrow = iterations, ncol = numks)

splitPerc <- 0.85 # Training / Test split percentage

# BUG FIX: seed once, OUTSIDE the loop. Seeding inside the loop made every
# iteration use the identical train/test split, so the 10-way average was a no-op.
set.seed(12345)
for (j in seq_len(iterations))
{
  trainIndices <- sample(seq_len(nrow(KNN.DF.N)), round(splitPerc * nrow(KNN.DF.N)))
  train.KNN.DF <- KNN.DF.N[trainIndices, ]
  test.KNN.DF <- KNN.DF.N[-trainIndices, ]
  for (i in seq_len(numks))
  {
    # Columns 1:8 are the continuous predictors; y is the class label.
    classifications <- knn(train.KNN.DF[, 1:8], test.KNN.DF[, 1:8],
                           train.KNN.DF$y, prob = TRUE, k = i)
    CM <- confusionMatrix(table(classifications, test.KNN.DF$y))
    masterAcc[j, i] <- CM$overall[1]
    masterSensitivity[j, i] <- CM$byClass[1]
    masterSpecificity[j, i] <- CM$byClass[2]
  }
  # BUG FIX: removed `masterAcc[j] = CM$overall[1]` (and the sensitivity /
  # specificity analogues). Linear matrix indexing silently overwrote the k=1
  # column's cells with the last (k=numks) result, corrupting the column means.
}
# Per-k averages across iterations (hoisted out of the loop).
MeanAcc <- colMeans(masterAcc)
MeanSensitivity <- colMeans(masterSensitivity)
MeanSpecificity <- colMeans(masterSpecificity)
# Accuracy as a function of k, averaged across the iterations.
plot(seq_len(numks), MeanAcc, type = "l",
     main = "Plot of value of K vs. Accuracy",
     xlab = "Value of K", ylab = "Accuracy %")

#which.max(MeanAcc)
#max(MeanSensitivity)
#max(MeanSpecificity)

# Best K is at 20 with 73.83% Accuracy, Sensitivity is 78.72% and Specificity is 69.19%

# Collapse the per-k averages into single overall means.
MeanAcc <- mean(colMeans(masterAcc))
MeanSpecificity <- mean(colMeans(masterSpecificity))
MeanSensitivity <- mean(colMeans(masterSensitivity))

print(paste("Best Accuracy at K =  ", which.max(colMeans(masterAcc)), "of", max(colMeans(masterAcc)) * 100, "%"))
## [1] "Best Accuracy at K =   45 of 74.6977547495682 %"
print(paste("Mean Accuracy = ", MeanAcc * 100, "%"))
## [1] "Mean Accuracy =  73.5164075993091 %"
print(paste("Mean Sensitivity = ", MeanSensitivity * 100, "%"))
## [1] "Mean Sensitivity =  77.4893617021277 %"
print(paste("Mean Specificity = ", MeanSpecificity * 100, "%"))
## [1] "Mean Specificity =  69.7441077441077 %"

# Run KNN Model for K=20 on the last train/test split.
classifications.K20 <- knn(train.KNN.DF[, 1:8], test.KNN.DF[, 1:8],
                           train.KNN.DF$y, prob = TRUE, k = 20)
CM.K20 <- confusionMatrix(table(classifications.K20, test.KNN.DF$y))

# BUG FIX: these three prints previously reported CM.K41 (the 15-variable
# model's confusion matrix) while labeling the values "K-20". They now report
# CM.K20; the recorded output below was produced by the pre-fix code and
# therefore shows the K=41 values.
print(paste("Accuracy for K-20 = ", CM.K20$overall[1] * 100, "%"))
## [1] "Accuracy for K-20 =  74.006908462867 %"
print(paste("Sensitivity for K-20 = ", CM.K20$byClass[1] * 100, "%"))
## [1] "Sensitivity for K-20 =  79.2553191489362 %"
print(paste("Specificity for K-20 = ", CM.K20$byClass[2] * 100, "%"))
## [1] "Specificity for K-20 =  69.023569023569 %"

Random Forest

# Random forest data preparation: keep the response plus the two strongest
# numeric predictors (the full candidate set is preserved below for reference).
df <- bank_additional
#df <- subset(df, select = c(y, age, campaign, pdays, previous, emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed, default, housing, loan, poutcome))
df <- subset(df, select = c(y, age, euribor3m))

df[c("y")] <- lapply(df[c("y")], factor)
#df[c("default")]<- lapply(df[c("default")], factor)
#df[c("housing")]<- lapply(df[c("housing")], factor)
#df[c("loan")]<- lapply(df[c("loan")], factor)
#df[c("poutcome")]<- lapply(df[c("poutcome")], factor)

dim(df) #rows and columns
## [1] 41188     3
df_clean <- df
set.seed(12345)

# BUG FIX: the original filtered on df_clean$default / $housing / $loan, but
# those columns were dropped by the subset() above. `$` on a missing column
# returns NULL, grepl() on NULL returns logical(0), and subsetting with a
# zero-length logical silently emptied df_clean. Only filter columns that
# actually exist.
for (col in c("default", "housing", "loan")) {
  if (col %in% names(df_clean)) {
    df_clean <- df_clean[!grepl("unknown", df_clean[[col]]), ]
  }
}

#df = df_clean

# Rescale the data; normalize() comes from an attached package (presumably
# BBmisc, given the [0, 1] ranges in the summary below) — TODO confirm.
df_norm <- normalize(df)

# Sanity-check the normalized data: age and euribor3m now span [0, 1].
summary(df_norm)
##    y              age           euribor3m     
##  no :36548   Min.   :0.0000   Min.   :0.0000  
##  yes: 4640   1st Qu.:0.1852   1st Qu.:0.1610  
##              Median :0.2593   Median :0.9574  
##              Mean   :0.2842   Mean   :0.6772  
##              3rd Qu.:0.3704   3rd Qu.:0.9810  
##              Max.   :1.0000   Max.   :1.0000
# Stratified 75/25 train/test split on the response (caTools::sample.split
# balances class proportions across the two sets).
sample = sample.split(df_norm$y, SplitRatio = .75)
train = subset(df_norm, sample == TRUE)
test  = subset(df_norm, sample == FALSE)
dim(train)
## [1] 30891     3
dim(test)
## [1] 10297     3
# Fit a random forest classifier of y on all remaining columns (age, euribor3m)
# using the package defaults (ntree, mtry not tuned here).
rf <- randomForest(
  y ~ .,
  data=train
)
# Variable importance: euribor3m dominates age by Gini decrease.
importance(rf)
##           MeanDecreaseGini
## age               640.9315
## euribor3m        1697.0997
# Predict on the held-out set; test[-1] drops the first column (y) so only
# the predictors are passed.
pred = predict(rf, newdata=test[-1])
# Confusion matrix: rows = actual (test[,1] is y), columns = predicted.
cm = table(test[,1], pred)
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##      pred
##         no  yes
##   no  8940  197
##   yes  908  252
##                                           
##                Accuracy : 0.8927          
##                  95% CI : (0.8865, 0.8986)
##     No Information Rate : 0.9564          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2672          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9078          
##             Specificity : 0.5612          
##          Pos Pred Value : 0.9784          
##          Neg Pred Value : 0.2172          
##              Prevalence : 0.9564          
##          Detection Rate : 0.8682          
##    Detection Prevalence : 0.8873          
##       Balanced Accuracy : 0.7345          
##                                           
##        'Positive' Class : no              
## 
# Mosaic-style plot of predicted vs actual class, with the y = x reference line.
plot(pred , test$y,main="Bagged Model",xlab="Predicted",ylab="Test Set Y")
abline (0,1)

LDA

LDA is not an ideal tool when the predictors and the response variable are categorical. However, the analysis is presented here for completeness.

## Call:
## lda(Class ~ ., data = train_bankdata_ds)
## 
## Prior probabilities of groups:
##        no       yes 
## 0.5024188 0.4975812 
## 
## Group means:
##          age jobblue-collar jobentrepreneur jobhousemaid jobmanagement
## no  38.87586      0.1977304      0.03473177   0.02372765     0.0801238
## yes 40.38646      0.1177083      0.02569444   0.02361111     0.0750000
##     jobretired jobself-employed jobservices jobstudent jobtechnician
## no  0.03370014       0.03473177  0.10281981 0.01203576     0.1609354
## yes 0.09444444       0.03437500  0.06666667 0.05277778     0.1649306
##     jobunemployed maritalmarried maritalsingle educationbasic.6y
## no     0.02097662      0.5773728     0.2940165        0.04779917
## yes    0.03437500      0.5295139     0.3618056        0.03784722
##     educationbasic.9y educationhigh.school educationilliterate
## no         0.14993122            0.2678817         0.000343879
## yes        0.09826389            0.2427083         0.001041667
##     educationprofessional.course educationuniversity.degree housingyes
## no                     0.1313618                  0.3290922  0.5381706
## yes                    0.1378472                  0.3993056  0.5600694
##       loanyes contacttelephone  monthaug   monthdec  monthjul  monthjun
## no  0.1619670        0.3559147 0.1537139 0.00343879 0.1657497 0.1234525
## yes 0.1510417        0.1444444 0.1399306 0.01909722 0.1295139 0.1218750
##        monthmar  monthmay   monthnov   monthoct    monthsep day_of_weekmon
## no  0.007909216 0.3342503 0.12379642 0.01581843 0.009628611      0.2107978
## yes 0.059722222 0.1791667 0.09791667 0.07361111 0.061458333      0.1815972
##     day_of_weekthu day_of_weektue day_of_weekwed duration campaign    pdays
## no       0.2070151      0.2077029      0.1887895 223.6228 2.575997 980.8982
## yes      0.2263889      0.2093750      0.2055556 524.0330 2.023611 784.1878
##      previous poutcomenonexistent poutcomesuccess emp.var.rate cons.price.idx
## no  0.1444292           0.8758597      0.01616231   0.09920908       93.54671
## yes 0.4975694           0.6684028      0.20069444  -1.39652778       93.31208
##     cons.conf.idx euribor3m nr.employed
## no      -40.66967  3.667028    5171.081
## yes     -39.80458  1.961340    5088.038
## 
## Coefficients of linear discriminants:
##                                        LD1
## age                          -0.0011902324
## jobblue-collar               -0.0888352316
## jobentrepreneur              -0.0262217669
## jobhousemaid                 -0.0102036344
## jobmanagement                -0.0596353556
## jobretired                    0.2695045085
## jobself-employed             -0.0454606362
## jobservices                  -0.0244081974
## jobstudent                    0.3791369431
## jobtechnician                 0.1329959384
## jobunemployed                 0.2260065299
## maritalmarried                0.0943189562
## maritalsingle                 0.1236337792
## educationbasic.6y             0.0335797703
## educationbasic.9y             0.0038749005
## educationhigh.school         -0.0132030059
## educationilliterate           1.1228617033
## educationprofessional.course  0.0382195325
## educationuniversity.degree    0.1733471306
## housingyes                    0.0077398325
## loanyes                      -0.1025505459
## contacttelephone             -0.3367896764
## monthaug                      0.5218654575
## monthdec                     -0.1225151835
## monthjul                      0.0002390292
## monthjun                     -0.3975591964
## monthmar                      1.2659485471
## monthmay                     -0.4362814431
## monthnov                     -0.4384783281
## monthoct                      0.0104165824
## monthsep                      0.2526261124
## day_of_weekmon               -0.0666560394
## day_of_weekthu                0.0007497524
## day_of_weektue               -0.0017921098
## day_of_weekwed                0.1047784335
## duration                      0.0027459462
## campaign                     -0.0092834282
## pdays                        -0.0006739991
## previous                      0.0002493134
## poutcomenonexistent           0.3087063111
## poutcomesuccess               0.2295405552
## emp.var.rate                 -1.3064584484
## cons.price.idx                1.2819712430
## cons.conf.idx                 0.0011031900
## euribor3m                     0.4826144524
## nr.employed                   0.0001728640
##  no yes 
## 988 942
## Confusion Matrix and Statistics
## 
##      ldap
##        no yes
##   no  825 126
##   yes 163 816
##                                           
##                Accuracy : 0.8503          
##                  95% CI : (0.8336, 0.8659)
##     No Information Rate : 0.5119          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7006          
##                                           
##  Mcnemar's Test P-Value : 0.0342          
##                                           
##             Sensitivity : 0.8350          
##             Specificity : 0.8662          
##          Pos Pred Value : 0.8675          
##          Neg Pred Value : 0.8335          
##              Prevalence : 0.5119          
##          Detection Rate : 0.4275          
##    Detection Prevalence : 0.4927          
##       Balanced Accuracy : 0.8506          
##                                           
##        'Positive' Class : no              
## 
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(test_bankdata_ds$Class, ldap)
## X-squared = 945.97, df = 1, p-value < 2.2e-16